I will be using a dataset of comments from Wikipedia's talk page edits, provided by Kaggle for a competition. You can access the data at the link below.
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
import time
import warnings
import pandas as pd, numpy as np
%matplotlib inline
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
#color = sns.color_palette()
#from wordcloud import WordCloud ,STOPWORDS
#from PIL import Image
import re
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
color = sns.color_palette()
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
#import locale;
#print(locale.getdefaultlocale());
from IPython.display import Image
from IPython.core.display import HTML
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
# One-time NLTK resource downloads and shared NLP state used by clean() below.
nltk.download('wordnet')
nltk.download('stopwords')
eng_stopwords = set(stopwords.words("english"))
warnings.filterwarnings("ignore")  # silence library warnings notebook-wide
tokenizer=TweetTokenizer()
lem = WordNetLemmatizer()
# Load the Kaggle train set, shuffle reproducibly, keep the first 20k rows
# as the working set; the remainder (df_others) is held back and used
# later for data augmentation.
df = pd.read_csv('/Users/yetkineser/Desktop/BDA 502/project/data/train.csv')
df = shuffle(df,random_state=7)
df_others = df.iloc[20000:,]
df = df.iloc[:20000,]
df = df.reset_index(drop=True)
df_others = df_others.reset_index(drop=True)
df.head(15)
df['comment_text'][3]
df['comment_text'][7]
# Comment-length distribution: mean/std/max plus a histogram.
lengths = df.comment_text.str.len()
lengths.mean(), lengths.std(), lengths.max()
lengths.hist();
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Helper targets: 'none' = 1 when no toxicity label is set,
# 'any' = 1 when at least one label is set (the binary target used below).
df['none'] = 1-df[label_cols].max(axis=1)
df['any'] = df[label_cols].max(axis=1)
df.describe()
label_cols = ['any','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Per-label value counts; display() is the IPython rich-output helper.
for col in label_cols:
    count = df.groupby(col)['any'].count()
    new_df = pd.concat([count], axis=1)
    new_df.columns = ['count']
    display(new_df.sort_values(by=['count'],ascending=False))
print("Total comments = ",len(df))
print("Total clean comments = ",len(df)-df['any'].sum())
# Bar chart: number of occurrences per class column (six labels + none/any).
x=df.iloc[:,2:10].sum()
#plot
plt.figure(figsize=(8,4))
# NOTE(review): positional (x, y) args to sns.barplot were removed in
# recent seaborn; current versions need keyword args.
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type ', fontsize=12)
#adding the text labels above each bar
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()
# Bar chart: how many comments carry multiple tags at once.
# NOTE(review): iloc[:,2:] also sums the 'none'/'any' helper columns,
# which inflates the per-row tag count — confirm intended.
rowsums=df.iloc[:,2:].sum(axis=1)
x=rowsums.value_counts()
#plot
plt.figure(figsize=(8,4))
ax = sns.barplot(x.index, x.values, alpha=0.8,color=color[2])
plt.title("Multiple tags per comment")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of tags ', fontsize=12)
#adding the text labels above each bar
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show()
# Missing-value audit.
print("Check for missing values in Train dataset")
null_check=df.isnull().sum()
print(null_check)
There are no null values in our dataset.
Let's look at the first five rows of our dataset.
df.head()
# Correlation heatmap across label columns.
# NOTE(review): iloc[:,2:-3] keeps toxic..insult only — identity_hate,
# none and any are excluded from the heatmap; confirm that is intended.
temp_df=df.iloc[:,2:-3]
# filter temp by removing clean comments
# temp_df=temp_df[~train.clean]
corr=temp_df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, annot=True)
PATH = "/Users/yetkineser/Desktop/BDA 502/project/photos/"
Image(filename = PATH + "crossvalidation.png", width=800, height=600)
# 80/20 train/test split of the 20k working rows.
total_rows = (len(df))
train_rows = round(0.8*total_rows)
train = df.iloc[:train_rows,]
train_others = train # i added this later; alias of train, reused for augmentation below
# train_rows-total_rows is negative, so this takes the LAST 20% of rows —
# equivalent to df.iloc[train_rows:,].
test = df.iloc[train_rows-total_rows:,]
train_2=train.iloc[:,0:2]
test_2=test.iloc[:,0:2]
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",total_rows, " rows on my data set ")
print("- I have ",train_rows, " rows on my first train set ")
print("- I have ",total_rows-train_rows, " rows on my test set ")
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Apostrophe-contraction lookup dict used by clean() to expand tokens
# such as "you're" -> "you are".
# Fixes vs. the original:
#  - removed duplicate keys "i'd" (kept the later-winning "I had") and
#    "didn't" (both mapped to "did not"),
#  - "we'll" mapped to " will" (missing "we"); corrected to "we will".
APPO = {
    "aren't" : "are not",
    "can't" : "can not",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll": "we will",
    "tryin'":"trying"
}
# Text series used downstream: the full corpus plus train/test splits.
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
def clean(comment):
    """
    Normalize a raw comment string into a cleaned, lemmatized word string.

    Steps: lowercase; strip newlines, slashes and bullet characters; drop
    IP addresses and [[user]] wiki markup; tokenize; expand apostrophe
    contractions via APPO; lemmatize as verbs; drop English stopwords;
    then remove non-word characters and digits and collapse whitespace.

    Relies on module-level globals: tokenizer, APPO, lem, eng_stopwords.
    """
    #Convert to lower case , so that Hi and hi are the same
    comment = comment.lower()
    #remove \n
    comment = re.sub(r"\n", " ", comment)
    comment = re.sub(r"/", " ", comment)  # added
    comment = re.sub("•", " ", comment)  # added
    # remove leaky elements like IP addresses
    comment = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", "", comment)
    # removing usernames like [[user ...]]
    comment = re.sub(r"\[\[.*\]", "", comment)
    #Split the sentences into words
    words = tokenizer.tokenize(comment)
    # (')aphostophe replacement (ie) you're --> you are
    words = [APPO[word] if word in APPO else word for word in words]
    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]
    clean_sent = " ".join(words)
    # remove any non-alphanumeric character, then digits
    clean_sent = re.sub(r"\W+", " ", clean_sent)
    clean_sent = re.sub(r"[0-9]+", "", clean_sent)
    # BUG FIX: the original re.sub(" ", " ", ...) was a no-op; collapse
    # the repeated spaces left behind by digit removal instead.
    clean_sent = re.sub(r" +", " ", clean_sent)
    return clean_sent
# Sanity-check the cleaner on one example, then clean the whole corpus.
corpus.iloc[5]
clean(corpus.iloc[5])
clean_corpus=corpus.apply(lambda x :clean(x))
def top_feats_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
    calculated across documents with the same class label.

    NOTE(review): dead code — this definition is shadowed by the
    multilabel re-definition of the same name further down the file.
    It also calls top_mean_feats, which is defined later in the file
    (resolved at call time, so this alone is not an error).
    '''
    dfs = []
    labels = np.unique(y)
    for label in labels:
        ids = np.where(y==label)
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs.append(feats_df)
    return dfs
# Unigram tfidf over the CLEANED corpus, used for top-features analysis.
tfv = TfidfVectorizer(min_df=200, max_features=10000,
                      strip_accents='unicode', analyzer='word',ngram_range=(1,1),
                      use_idf=1,smooth_idf=1,sublinear_tf=1,
                      stop_words = 'english')
tfv.fit(clean_corpus)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
# current versions require get_feature_names_out().
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#separate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags
df_tags=df.iloc[:,2:]
# NOTE(review): df_feats already contains the tag columns, so this
# concat duplicates them — confirm intended.
df_feats=pd.concat([df_feats,df_tags],axis=1)
def top_tfidf_feats(row, features, top_n=50):
    """Return a DataFrame pairing the top_n tfidf values in *row* with their feature names."""
    # Indices of the largest tfidf values, highest first.
    best_ids = np.argsort(row)[::-1][:top_n]
    pairs = [(features[idx], row[idx]) for idx in best_ids]
    result = pd.DataFrame(pairs, columns=['feature', 'tfidf'])
    return result
def top_feats_in_doc(Xtr, features, row_id, top_n=50):
    ''' Top tfidf features in a specific document (one row of the sparse matrix). '''
    dense_row = Xtr[row_id].toarray()
    return top_tfidf_feats(np.squeeze(dense_row), features, top_n)
def top_mean_feats(Xtr, features, grp_ids, min_tfidf=0.1, top_n=50):
    ''' Return the top n features that on average are most important amongst
    the documents identified by indices in grp_ids. Values below min_tfidf
    are zeroed before averaging. '''
    dense = Xtr[grp_ids].toarray()
    dense[dense < min_tfidf] = 0
    return top_tfidf_feats(np.mean(dense, axis=0), features, top_n)
# modified for multilabel multiclass
def top_feats_by_class(Xtr, features, min_tfidf=0.005, top_n=50):
    ''' Return a list of dfs, where each df holds top_n features and their mean tfidf value
    calculated across documents with the same class label.

    Relies on the module-level df_tags DataFrame of 0/1 label columns.
    '''
    dfs = []
    for col in df_tags.columns:
        ids = df_tags.index[df_tags[col]==1]
        feats_df = top_mean_feats(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        # BUG FIX: the original wrote `feats_df.label = label`, where
        # `label` is undefined in this function and silently resolved to
        # the leftover global from an earlier plotting loop. Tag each
        # frame with its own column name instead.
        feats_df.label = col
        dfs.append(feats_df)
    return dfs
# Top tfidf features per class (variable name carries a typo: "per_lass").
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level unigram tfidf on the RAW (uncleaned) corpus, for the classifiers.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
# Character n-gram (2..6) tfidf on the same raw corpus.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
# ===== Model 1: Logistic Regression on the original (imbalanced) train set =====
Image(filename = PATH + "metrics.png", width=700, height=700)
Image(filename = PATH + "logistic regression.png", width=700, height=600)
from sklearn.linear_model import LogisticRegression
class_names = ['any']  # single binary target: "any toxicity"
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
# Combined char + word tfidf features.
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
time1=time.time()
# 5-fold CV, one pass per metric (so the model is refit 15 times total).
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=5, solver='sag')
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
    scores.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
    scores2.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
    scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Fit once on the full train split; class_name/train_target are the
# leftover loop variables from above ('any').
classifier = LogisticRegression(C=5, solver='sag')
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
# Hard predictions at the 0.5 probability threshold.
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test, diagonal = chance.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# ===== Model 2: Multinomial Naive Bayes, same pipeline as Model 1 =====
class_names = ['any']
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
time1=time.time()
# 5-fold CV, one pass per metric.
for class_name in class_names:
    train_target = train[class_name]
    classifier = MultinomialNB()
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
    scores.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
    scores2.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
    scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Fit once on the full train split.
classifier = MultinomialNB()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# ===== Model 3: AdaBoost, same pipeline as Models 1 and 2 =====
class_names = ['any']
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
scores = []
scores2 = []
scores3 = []
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': train['id']})
time1=time.time()
# 5-fold CV, one pass per metric.
for class_name in class_names:
    train_target = train[class_name]
    classifier = AdaBoostClassifier()
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='accuracy')
    scores.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='precision')
    scores2.append(cv_score)
    cv_score = cross_val_score(classifier, train_features, train_target, cv=5, scoring='recall')
    scores3.append(cv_score)
time2=time.time()
print('Average CV accuracy is {}'.format(round(np.mean(scores),5)))
print('Standard Deviation of CV accuracy is {}'.format(round(np.std(scores),5)))
print('Average CV precion is {}'.format(round(np.mean(scores2),5)))
print('Standard Deviation of CV precision is {}'.format(round(np.std(scores2),5)))
print('Average CV recall is {}'.format(round(np.mean(scores3),5)))
print('Standard Deviation of CV recall is {}'.format(round(np.std(scores3),5)))
print("Time of cross validation",round(time2-time1,5))
# Fit once on the full train split.
classifier = AdaBoostClassifier()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
test["pred"]=test_pred["any"]>0.5
train["pred"]=train_pred["any"]>0.5
print("Accuracy score of train set : ",round(accuracy_score(train["any"], train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(train["any"], train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(train["any"], train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_train = confusion_matrix(train["any"], train["pred"])
print(confusion_matrix_train)
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# Inspect false negatives (actually toxic, predicted clean) of the last model.
Image(filename = PATH + "metricsresults1.png", width=800, height=600)
test_recall = test.loc[(test['any'] == 1) & (test['pred']==False)]
# NOTE(review): .ix was removed in pandas 1.0 — use .loc on current versions.
test_recall.ix[:, ["comment_text","any","pred"]]
test_recall = test_recall.reset_index(drop=True)
train_recall = train.loc[(train['any'] == 1) & (train['pred']==False)]
test_recall["comment_text"][7]
test_recall["comment_text"][3]
test_recall["comment_text"][12]
Image(filename = PATH + "steps2.png", width=800, height=600)
# Naive oversampling: replicate every toxic ('any'==1) train row 10 extra times.
train_any_1 = train.loc[(train['any'] == 1)]
train_any_1.ix[:, ["comment_text","any","pred"]]
train_any_1 = train_any_1.reset_index(drop=True)
new_train = pd.concat([train, train_any_1, train_any_1, train_any_1,
                       train_any_1, train_any_1, train_any_1,
                       train_any_1,train_any_1, train_any_1, train_any_1])
train_rows = (len(new_train))
test_rows = (len(test))
train_rows = round(train_rows)
# NOTE(review): this slice of df looks like a leftover — `train` is not
# used below in this section; new_train is the working train set. Confirm.
train = df.iloc[:train_rows,]
train_2=new_train.iloc[:,0:2]
test_2=test.iloc[:,0:2]
# df is REBOUND here to only the (id, comment_text) columns of train+test.
df = pd.concat([train_2,test_2])
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",train_rows, " rows on my new regenerated train set ")
print("- I have ",test_rows, " rows on my test set ")
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
# Re-clean and re-vectorize on the oversampled corpus.
clean_corpus=corpus.apply(lambda x :clean(x))
tfv = TfidfVectorizer(min_df=200, max_features=10000,
                      strip_accents='unicode', analyzer='word',ngram_range=(1,1),
                      use_idf=1,smooth_idf=1,sublinear_tf=1,
                      stop_words = 'english')
tfv.fit(clean_corpus)
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#separate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags
# NOTE(review): df now holds only (id, comment_text), so df_tags is empty
# here and the top_feats_by_class call below yields no per-label frames.
df_tags=df.iloc[:,2:]
df_feats=pd.concat([df_feats,df_tags],axis=1)
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level unigram tfidf on the oversampled raw corpus.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
# Character n-gram (2..6) tfidf on the oversampled raw corpus.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
# ===== Logistic Regression on the oversampled train set (no CV this time) =====
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
# class_name is still 'any', left over from the earlier loops.
train_target = new_train[class_name]
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': new_train['id']})
classifier = LogisticRegression(C=5, solver='sag')
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# ===== Multinomial NB on the oversampled train set =====
classifier = MultinomialNB()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
# ===== AdaBoost on the oversampled train set =====
classifier = AdaBoostClassifier()
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train["any"], new_train["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train["any"], new_train["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train["any"], new_train["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
confusion_matrix_test = confusion_matrix(test["any"], test["pred"])
print(confusion_matrix_test)
Image(filename = PATH + "results2.png", width=800, height=600)
# Second strategy: instead of replicating rows, augment the ORIGINAL train
# split with the real toxic rows from the held-back df_others partition.
len(df_others)
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_others['none'] = 1-df_others[label_cols].max(axis=1)
df_others['any'] = df_others[label_cols].max(axis=1)
df_others_any_1 = df_others.loc[(df_others['any'] == 1)]
# NOTE(review): .ix was removed in pandas 1.0 — use .loc on current versions.
df_others_any_1.ix[:, ["comment_text","any","pred"]]
df_others_any_1 = df_others_any_1.reset_index(drop=True)
len(df_others_any_1)
# NOTE(review): train_others aliases the original train object, which has
# since been mutated in place (a 'pred' column was added) — confirm intended.
new_train_2 = pd.concat([train_others, df_others_any_1])
len(new_train_2)
train_rows = (len(new_train_2))
test_rows = (len(test))
train_rows = round(train_rows)
# NOTE(review): like before, this `train` slice looks like a leftover —
# new_train_2 is the working train set below.
train = df.iloc[:train_rows,]
train_2=new_train_2.iloc[:,0:2]
test_2=test.iloc[:,0:2]
# df rebound again to only (id, comment_text) of train+test.
df = pd.concat([train_2,test_2])
df_2=df.iloc[:,0:2]
df_2=df_2.reset_index(drop=True)
print("- I have ",train_rows, " rows on my new regenerated train set ")
print("- I have ",test_rows, " rows on my test set ")
corpus=df_2.comment_text
train_text = train_2.comment_text
test_text = test_2.comment_text
clean_corpus=corpus.apply(lambda x :clean(x))
tfv = TfidfVectorizer(min_df=200, max_features=10000,
                      strip_accents='unicode', analyzer='word',ngram_range=(1,1),
                      use_idf=1,smooth_idf=1,sublinear_tf=1,
                      stop_words = 'english')
tfv.fit(clean_corpus)
features = np.array(tfv.get_feature_names())
df_unigrams = tfv.transform(clean_corpus.iloc[:df.shape[0]])
#separate train and test features
df_feats=df.iloc[0:len(df),]
#join the tags (df_tags is empty here — see earlier note)
df_tags=df.iloc[:,2:]
df_feats=pd.concat([df_feats,df_tags],axis=1)
tfidf_top_n_per_lass=top_feats_by_class(df_unigrams,features)
# Vectorizers refit on the augmented corpus, then Logistic Regression.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(corpus)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(corpus)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])
# class_name is still 'any', left over from the earlier loops.
train_target = new_train_2[class_name]
test_pred = pd.DataFrame.from_dict({'id': test['id']})
train_pred = pd.DataFrame.from_dict({'id': new_train_2['id']})
classifier = LogisticRegression(C=5, solver='sag')
time1=time.time()
classifier.fit(train_features, train_target)
time2=time.time()
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3=time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4=time.time()
test["pred"]=test_pred["any"]>0.5
new_train_2["pred"]=train_pred["any"]>0.5
print("Training time = ",round(time2-time1,5))
print("Test time = ",round(time4-time3,5))
print("Accuracy score of train set : ",round(accuracy_score(new_train_2["any"], new_train_2["pred"]),5))
print("Accuracy score of test set : ",round(accuracy_score(test["any"], test["pred"]),5))
print("Precision score of train set : ",round(precision_score(new_train_2["any"], new_train_2["pred"]),5))
print("Precision score of test set : ",round(precision_score(test["any"], test["pred"]),5))
print("Recall score of train set : ",round(recall_score(new_train_2["any"], new_train_2["pred"]),5))
print("Recall score of test set : ",round(recall_score(test["any"], test["pred"]),5))
# calculate the fpr and tpr for all thresholds of the classification
from sklearn import metrics
preds = test["pred"]
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
# ROC curves for train and test.
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# ---- Multinomial Naive Bayes on the same features --------------------
# Suitable here because TF-IDF values are non-negative, which
# MultinomialNB requires.
classifier = MultinomialNB()
time1 = time.time()
classifier.fit(train_features, train_target)
time2 = time.time()
# Probability of the positive class for the train and test sets.
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3 = time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4 = time.time()
# Hard labels at the conventional 0.5 cut-off.
# NOTE(review): reads column "any", so it assumes class_name == "any".
test["pred"] = test_pred["any"] > 0.5
new_train_2["pred"] = train_pred["any"] > 0.5
print("Training time = ", round(time2 - time1, 5))
print("Test time = ", round(time4 - time3, 5))
print("Accuracy score of train set : ", round(accuracy_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Accuracy score of test set : ", round(accuracy_score(test["any"], test["pred"]), 5))
print("Precision score of train set : ", round(precision_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Precision score of test set : ", round(precision_score(test["any"], test["pred"]), 5))
print("Recall score of train set : ", round(recall_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Recall score of test set : ", round(recall_score(test["any"], test["pred"]), 5))
# ROC curves from the predicted probabilities. FIX: removed the unused
# `preds = test["pred"]` local and the redundant re-import of
# matplotlib.pyplot (imported at file top).
from sklearn import metrics
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# ---- AdaBoost (default decision-stump base learner) ------------------
# NOTE(review): fitting stumps on ~60k sparse features is slow; kept as
# in the original for comparability across the three models.
classifier = AdaBoostClassifier()
time1 = time.time()
classifier.fit(train_features, train_target)
time2 = time.time()
# Probability of the positive class for the train and test sets.
train_pred[class_name] = classifier.predict_proba(train_features)[:, 1]
time3 = time.time()
test_pred[class_name] = classifier.predict_proba(test_features)[:, 1]
time4 = time.time()
# Hard labels at the conventional 0.5 cut-off.
# NOTE(review): reads column "any", so it assumes class_name == "any".
test["pred"] = test_pred["any"] > 0.5
new_train_2["pred"] = train_pred["any"] > 0.5
print("Training time = ", round(time2 - time1, 5))
print("Test time = ", round(time4 - time3, 5))
print("Accuracy score of train set : ", round(accuracy_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Accuracy score of test set : ", round(accuracy_score(test["any"], test["pred"]), 5))
print("Precision score of train set : ", round(precision_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Precision score of test set : ", round(precision_score(test["any"], test["pred"]), 5))
print("Recall score of train set : ", round(recall_score(new_train_2["any"], new_train_2["pred"]), 5))
print("Recall score of test set : ", round(recall_score(test["any"], test["pred"]), 5))
# ROC curves from the predicted probabilities. FIX: removed the unused
# `preds = test["pred"]` local and the redundant re-import of
# matplotlib.pyplot (imported at file top).
from sklearn import metrics
fpr, tpr, threshold = metrics.roc_curve(test["any"], test_pred["any"])
fpr_t, tpr_t, threshold_t = metrics.roc_curve(new_train_2["any"], train_pred["any"])
roc_auc = metrics.auc(fpr, tpr)
roc_auc_t = metrics.auc(fpr_t, tpr_t)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'TEST_AUC = %0.5f' % roc_auc)
plt.plot(fpr_t, tpr_t, 'r', label = 'TRAIN_AUC = %0.5f' % roc_auc_t)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
Image(filename = PATH + "results3.png", width=800, height=600)